library(dplyr)
library(ggplot2)
library(did)
library(readr)

# Load necessary data
load("00_data/01_data_processed/data_epa_analysis.Rdata")

# Preparing the data for modeling

# Set treatment year for analysis
epa_only$treatment_year_reg <- ifelse(epa_only$treatment_region == 0, 0, 1995)
epa_only$treatment_year_sta <- ifelse(epa_only$treatment_state == 0, 0, 1995)
final_panel_data$treatment_year_reg <- ifelse(final_panel_data$treatment_region == 0, 0, 1995)
final_panel_data$treatment_year_sta <- ifelse(final_panel_data$treatment_state == 0, 0, 1995)

# Convert PGM_SYS_ID to a numeric identifier
epa_only$PGM_SYS_ID_num <- as.numeric(as.factor(epa_only$PGM_SYS_ID))
final_panel_data$PGM_SYS_ID_num <- as.numeric(as.factor(final_panel_data$PGM_SYS_ID))

# Define the categorization for each region
region_categories <- c(
  "01" = "Low",     # SITES == 2
  "02" = "Low",     # SITES == 12
  "03" = "Medium",  # SITES == 39
  "04" = "Medium",  # SITES == 72
  "05" = "High",    # SITES == 112
  "07" = "Medium"   # SITES == 23
)

# Define the set of categories to loop over
categories <- c("Low", "Medium", "High")

# Create a container (list) to hold the dynamic effect results for each category
category_results <- list()

# Loop over each category
for (cat in categories) {
  
  # Get the region codes corresponding to the current category
  region_codes <- names(region_categories)[region_categories == cat]
  
  # --- Subset the data for the current category ---
  # Include observations whose EPA_REGION is in the current category OR are never treated (control)
  cat_data <- epa_only %>%
    filter((EPA_REGION %in% region_codes | treatment_year_reg == 0) &
             year < 2002 & year > 1989)
  
  # --- Run the DID analysis on this subset ---
  model_cat <- att_gt(
    yname  = "outcome_epa",
    tname  = "year",
    idname = "PGM_SYS_ID_num",
    gname  = "treatment_year_reg",
    data   = cat_data,
    alp    = 0.01  # 99% confidence level
  )
  
  # --- Get the dynamic treatment effect estimates ---
  aggte_cat <- aggte(model_cat, type = "dynamic", na.rm = TRUE)
  
  # --- Convert to a tidy data frame ---
  cat_df <- ggdid(aggte_cat)$data
  
  category_results[[cat]] <- cat_df
}

# Combine all category-specific results into one data frame
all_categories_df <- do.call(rbind, category_results)
all_categories_df$category <- c(rep("Low",11), rep("Medium", 11), rep("High",11))

# Plot all categories' dynamic estimates together
ggplot(
  all_categories_df,
  aes(
    x = year,
    y = att,
    ymin = att - c * att.se,
    ymax = att + c * att.se,
    color = category
  )
) +
  geom_pointrange(size = .2) +
  geom_line(linetype = 3) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 4, linetype = "dotted") +
  theme_bw() +
  labs(x = "Years Since Treatment", y = "Estimate (99% conf. band)", color = "EPA Category") +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08)) +
  theme(
    legend.background = element_rect(
      fill = "white",
      linewidth = 0.2,
      linetype = "solid",
      colour = "black"
    ),
    legend.position = c(0.9, 0.2),
    legend.title = element_blank()
  ) +
  scale_color_discrete(breaks = c('High', 'Medium', 'Low'))


ggsave("03_figures/appendix/figure_a11.pdf", height = 3, width = 8)

rm(list = ls())

                       